Previous script: “06_get_eigengene_QTL.Rmd”
The goal is to find QTL peaks for the WGCNA eigen genes and see if those overalp with any growth QTL. We are only focusing on eigen genes that correlated with some growth traits/paramters.
library(GenomicRanges)
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap,
parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums,
dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply,
setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Loading required package: GenomeInfoDb
library(qtl)
library(tidyverse)
[30m── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.0.0 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mdplyr [30m 0.7.6
[32m✔[30m [34mtidyr [30m 0.8.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mcollapse()[30m masks [34mIRanges[30m::collapse()
[31m✖[30m [34mdplyr[30m::[32mcombine()[30m masks [34mBiocGenerics[30m::combine()
[31m✖[30m [34mdplyr[30m::[32mdesc()[30m masks [34mIRanges[30m::desc()
[31m✖[30m [34mtidyr[30m::[32mexpand()[30m masks [34mS4Vectors[30m::expand()
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mfirst()[30m masks [34mS4Vectors[30m::first()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mggplot2[30m::[32mPosition()[30m masks [34mBiocGenerics[30m::Position(), [34mbase[30m::Position()
[31m✖[30m [34mpurrr[30m::[32mreduce()[30m masks [34mGenomicRanges[30m::reduce(), [34mIRanges[30m::reduce()
[31m✖[30m [34mdplyr[30m::[32mrename()[30m masks [34mS4Vectors[30m::rename()
[31m✖[30m [34mdplyr[30m::[32mslice()[30m masks [34mIRanges[30m::slice()[39m
library(stringr)
load("../output/scanone-eigengene-qtl_2012.RData")
threshold.95 <- tibble(perm.threshold=lod.thrs[5,],
trait=colnames(lod.thrs))
scanone.gather <- scanone_eigen %>%
gather(key = trait, value = LOD, -chr, -pos) %>%
mutate(condition=str_sub(trait,1,2), color=str_sub(trait,6,100)) %>%
left_join(threshold.95)
Joining, by = "trait"
scanone.gather
pl.UN <- scanone.gather %>% filter(condition=="UN") %>%
ggplot(aes(x=pos,y=LOD)) +
geom_line() +
geom_hline(aes(yintercept=perm.threshold),lty=2,lwd=.5,alpha=.5) +
facet_grid(trait ~ chr, scales="free") +
theme(strip.text.y = element_text(angle=0), axis.text.x = element_text(angle=90)) +
ggtitle("UN Eigen Gene QTL")
pl.UN
ggsave("../output/eigen gene eQTL UN 2012.pdf",width=12,height=8)
For each eigen gene, find QTL borders and look for overlap with growth QTL
For each eigen gene first identify chromosomes with “significant” peaks (in this case > 99% permuation threshold) and then run bayesint() on them to define the intervals
sig.chrs <- scanone.gather %>% filter(LOD > perm.threshold) %>%
group_by(trait,chr) %>%
summarize(unique(chr))
sig.chrs
now for each significant chromosome/trait combo run bayesint
scanone_eigen.phys <- scanone_eigen[!str_detect(rownames(scanone_eigen),"^cA"),]
bayesint.list <- apply(sig.chrs,1,function(hit) {
result <- bayesint(scanone_eigen.phys[c("chr","pos",hit["trait"])],
chr=hit["chr"],
lodcolumn = 1,
expandtomarkers = TRUE
)
colnames(result)[3] <- "LOD"
result
})
names(bayesint.list) <- sig.chrs$trait
bayesint.list <- lapply(bayesint.list,function(x) x %>%
as.data.frame() %>%
rownames_to_column(var="markername") %>%
mutate(chr=as.character(chr))
)
bayesint.result <- as.tibble(bind_rows(bayesint.list,.id="trait")) %>%
select(trait,chr,pos,markername,LOD) %>%
separate(markername,into=c("chr1","Mbp"),sep="x", convert=TRUE) %>%
group_by(trait,chr) %>%
summarize(start=min(Mbp),end=max(Mbp),min_eQTL_LOD=min(LOD),max_eQTL_LOD=max(LOD)) %>%
#for the high QTL peaks the interval width is 0. That is overly precise and need to widen those.
mutate(start=ifelse(start==end,max(0,start-20000),start), end=ifelse(start==end,end+20000,end))
bayesint.result
Load annotation
BrapaAnnotation <- read_csv("../input/Brapa_V1.5_annotated.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_integer(),
name = col_character(),
chrom = col_character(),
start = col_integer(),
end = col_integer(),
subject = col_character(),
AGI = col_character(),
At_symbol = col_character(),
At_description = col_character(),
perc_ID = col_double(),
aln_length = col_integer(),
mismatch = col_integer(),
gap_open = col_integer(),
qstart = col_integer(),
qend = col_integer(),
sstart = col_integer(),
send = col_integer(),
eval = col_double(),
score = col_double()
)
BrapaAnnotation
eigen.annotated <- lapply(1:nrow(bayesint.result),function(row) {
qtl <- bayesint.result[row,]
results <- subset(BrapaAnnotation, chrom==qtl$chr &
start >= qtl$start &
end <= qtl$end)
}
)
names(eigen.annotated) <- bayesint.result$trait
eigen.annotated <- bind_rows(eigen.annotated,.id="trait") %>%
mutate(chrom=as.character(chrom)) %>%
left_join(bayesint.result,by=c("trait","chrom"="chr")) %>% #get eQTL LOD
rename(eigen_eQTL_candidate=name)
eigen.annotated.small <- eigen.annotated %>% select(trait,eigen_eQTL_candidate,ends_with("LOD"))
eigen.annotated.small
given bayesint results, find overlaps with UN growth QTL
filepath <- "../input/All2012HeightQTL2.xlsx"
filebase <- filepath %>% basename() %>% str_replace("\\..*$","")
QTLgenes <- readxl::read_excel(filepath)[,-1]
QTLgenes <- QTLgenes %>% dplyr::rename(.id=QTL, FVTtrait=FVT) # change names to match previous file
QTLgenes <- QTLgenes %>% filter(str_detect(FVTtrait,"^UN"))
QTLgenes
eigen.qtl.combined <- inner_join(eigen.annotated.small,QTLgenes,by=c("eigen_eQTL_candidate"="name")) %>%
select(.id, trait, everything())
eigen.qtl.combined
how many QTL have at least some overlap?
sort(unique(QTLgenes$.id))
[1] "QTL1" "QTL12" "QTL13" "QTL14" "QTL15" "QTL16" "QTL17" "QTL18" "QTL19" "QTL2" "QTL3" "QTL33"
[13] "QTL34" "QTL35" "QTL6" "QTL7"
sort(unique(eigen.qtl.combined$.id))
[1] "QTL1" "QTL13" "QTL14" "QTL19" "QTL3" "QTL35" "QTL6" "QTL7"
three of four
are all eigen genes overlapping?
unique(eigen.annotated.small$trait)
[1] "UN_MEblue" "UN_MEbrown" "UN_MEcyan" "UN_MEdarkslateblue"
[5] "UN_MElightgreen" "UN_MEmidnightblue" "UN_MEpurple" "UN_MEsteelblue"
[9] "UN_MEturquoise" "UN_MEyellow" "UN_MEyellowgreen"
unique(eigen.qtl.combined$trait)
[1] "UN_MEblue" "UN_MEbrown" "UN_MEcyan" "UN_MEdarkslateblue"
[5] "UN_MEmidnightblue" "UN_MEpurple" "UN_MEturquoise" "UN_MEyellow"
[9] "UN_MEyellowgreen"
No, 7 of 11
write_csv(eigen.qtl.combined,
path=str_c("../output/", filebase, "_eigenQTL_overlap_", Sys.Date(), ".csv"))
first convert things to ranges
qtl.info <- QTLgenes %>%
group_by(.id) %>%
summarize(chrom=unique(chrom),start=min(start),end=max(end))
qtl.info
qtl.ranges <- GRanges(seqnames = qtl.info$chrom,ranges=IRanges(start=qtl.info$start,end=qtl.info$end))
qtl.ranges <- GenomicRanges::reduce(qtl.ranges)
eQTL.ranges <- GRanges(bayesint.result$chr,
ranges = IRanges(start=bayesint.result$start,
end=bayesint.result$end))
eQTL.ranges <- GenomicRanges::reduce(eQTL.ranges)
Make table of chromosome info
chr.info <- scanone_eigen.phys %>%
as.data.frame() %>%
rownames_to_column("marker") %>%
select(marker) %>%
separate(marker,into=c("chr","bp"),sep="x",convert=TRUE) %>%
group_by(chr) %>%
summarize(start=min(bp),end=max(bp))
Do the simulations
sims <- 1000
set.seed(2323)
sim.results <- sapply(1:sims, function(s) {
if (s %% 100 == 0) print(s)
sim.eQTL <- tibble(
chr=sample(chr.info$chr,
size = length(eQTL.ranges),
replace = TRUE,
prob=chr.info$end/sum(chr.info$end)),
width=width(eQTL.ranges) # width of the QTL to simulate
)
sim.eQTL <- chr.info %>%
select(chr,chr.start=start,chr.end=end) %>% right_join(sim.eQTL,by="chr") #need to get the chrom end so we can sample correctly
sim.eQTL <- sim.eQTL %>% mutate(qtl.start = runif(n=n(),
min = chr.start,
max= max(chr.start,chr.end-width)),
qtl.end=qtl.start+width)
sim.eQTL.ranges <- GRanges(seqnames = sim.eQTL$chr,
ranges = IRanges(start=sim.eQTL$qtl.start,
end=sim.eQTL$qtl.end))
suppressWarnings(result <- sum(countOverlaps(qtl.ranges,sim.eQTL.ranges)>0))
result
})
[1] 100
[1] 200
[1] 300
[1] 400
[1] 500
[1] 600
[1] 700
[1] 800
[1] 900
[1] 1000
true.overlap <- sum(countOverlaps(qtl.ranges,eQTL.ranges)) #OK to ignore warnings
Each of the 2 combined objects has sequence levels not in the other:
- in 'x': A09, A05
- in 'y': A08
Make sure to always combine/compare objects based on the same reference
genome (use suppressWarnings() to suppress this warning).
true.overlap
[1] 4
mean(sim.results >= true.overlap)
[1] 0.065
tibble(FVTQTL_vs_MReQTL_True_Overlaps=true.overlap,
N_Simulations_fewer_overlaps=sum(sim.results < true.overlap),
N_Simulations_greater_equal_overlaps=sum(sim.results >= true.overlap),
P_value=mean(sim.results >= true.overlap)
) %>%
write_csv(str_c("../output/", filebase, "_WGCNA_eigen_eQTL_scanone_overlap_pval_", Sys.Date(), ".csv"))
threshold.95 <- tibble(perm.threshold=lod.thrs.cim[5,],
trait=colnames(lod.thrs.cim))
scanone.gather <- scanone_eigen_cim %>%
gather(key = trait, value = LOD, -chr, -pos) %>%
mutate(condition=str_sub(trait,1,2), color=str_sub(trait,6,100)) %>%
left_join(threshold.95)
Joining, by = "trait"
scanone.gather
pl.UN <- scanone.gather %>% filter(condition=="UN") %>%
ggplot(aes(x=pos,y=LOD)) +
geom_line() +
geom_hline(aes(yintercept=perm.threshold),lty=2,lwd=.5,alpha=.5) +
facet_grid(trait ~ chr, scales="free") +
theme(strip.text.y = element_text(angle=0), axis.text.x = element_text(angle=90)) +
ggtitle("UN Eigen Gene QTL")
pl.UN
ggsave("../output/eigen gene eQTL UN CIM 2012.pdf",width=12,height=8)
For each eigen gene, find QTL borders and look for overlap with growth QTL
For each eigen gene first identify chromosomes with “significant” peaks (in this case > 99% permuation threshold) and then runs bayesint() on them to define the intervals
sig.chrs <- scanone.gather %>% filter(LOD > perm.threshold) %>%
group_by(trait,chr) %>%
summarize(unique(chr))
sig.chrs
now for each significant chromosome/trait combo run bayesint
#remove markers without physical position
scanone_eigen_cim.phys <- scanone_eigen_cim[!str_detect(rownames(scanone_eigen),"^cA"),]
bayesint.list <- apply(sig.chrs,1,function(hit) {
result <- bayesint(scanone_eigen_cim.phys[c("chr","pos",hit["trait"])],
chr=hit["chr"],
lodcolumn = 1,
expandtomarkers = TRUE
)
colnames(result)[3] <- "LOD"
result
})
names(bayesint.list) <- sig.chrs$trait
bayesint.list <- lapply(bayesint.list,function(x) x %>%
as.data.frame() %>%
rownames_to_column(var="markername") %>%
mutate(chr=as.character(chr))
)
bayesint.result <- as.tibble(bind_rows(bayesint.list,.id="trait")) %>%
select(trait,chr,pos,markername,LOD) %>%
separate(markername,into=c("chr1","Mbp"),sep="x", convert=TRUE) %>%
group_by(trait,chr) %>%
summarize(start=min(Mbp),end=max(Mbp),min_eQTL_LOD=min(LOD),max_eQTL_LOD=max(LOD)) %>%
#for the high QTL peaks the interval width is 0. That is overly precise and need to widen those.
mutate(start=ifelse(start==end,max(0,start-20000),start), end=ifelse(start==end,end+20000,end))
bayesint.result
Load annotation
BrapaAnnotation <- read_csv("../input/Brapa_V1.5_annotated.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = col_integer(),
name = col_character(),
chrom = col_character(),
start = col_integer(),
end = col_integer(),
subject = col_character(),
AGI = col_character(),
At_symbol = col_character(),
At_description = col_character(),
perc_ID = col_double(),
aln_length = col_integer(),
mismatch = col_integer(),
gap_open = col_integer(),
qstart = col_integer(),
qend = col_integer(),
sstart = col_integer(),
send = col_integer(),
eval = col_double(),
score = col_double()
)
|============ | 13%
|============= | 14% 1 MB
|============== | 15% 1 MB
|=============== | 16% 1 MB
|================ | 17% 1 MB
|================= | 19% 1 MB
|================== | 20% 1 MB
|=================== | 21% 1 MB
|==================== | 22% 1 MB
|===================== | 24% 1 MB
|======================= | 25% 1 MB
|======================== | 26% 1 MB
|========================= | 27% 2 MB
|========================== | 29% 2 MB
|=========================== | 30% 2 MB
|============================ | 31% 2 MB
|============================= | 32% 2 MB
|=============================== | 34% 2 MB
|================================ | 35% 2 MB
|================================= | 36% 2 MB
|================================== | 37% 2 MB
|=================================== | 38% 2 MB
|==================================== | 40% 2 MB
|===================================== | 41% 3 MB
|====================================== | 42% 3 MB
|======================================= | 43% 3 MB
|======================================== | 44% 3 MB
|========================================== | 46% 3 MB
|=========================================== | 47% 3 MB
|============================================ | 48% 3 MB
|============================================= | 49% 3 MB
|============================================== | 51% 3 MB
|=============================================== | 52% 3 MB
|================================================ | 53% 3 MB
|================================================= | 54% 4 MB
|================================================== | 55% 4 MB
|=================================================== | 57% 4 MB
|===================================================== | 58% 4 MB
|====================================================== | 59% 4 MB
|======================================================= | 60% 4 MB
|======================================================== | 61% 4 MB
|========================================================= | 62% 4 MB
|========================================================== | 64% 4 MB
|=========================================================== | 65% 4 MB
|============================================================ | 66% 4 MB
|============================================================= | 67% 5 MB
|============================================================== | 69% 5 MB
|=============================================================== | 70% 5 MB
|================================================================ | 71% 5 MB
|================================================================== | 72% 5 MB
|=================================================================== | 73% 5 MB
|==================================================================== | 75% 5 MB
|===================================================================== | 76% 5 MB
|====================================================================== | 77% 5 MB
|======================================================================= | 78% 5 MB
|======================================================================== | 80% 5 MB
|========================================================================= | 81% 6 MB
|========================================================================== | 82% 6 MB
|============================================================================ | 83% 6 MB
|============================================================================= | 84% 6 MB
|============================================================================== | 86% 6 MB
|=============================================================================== | 87% 6 MB
|================================================================================ | 88% 6 MB
|================================================================================= | 89% 6 MB
|================================================================================== | 90% 6 MB
|=================================================================================== | 92% 6 MB
|===================================================================================== | 93% 6 MB
|====================================================================================== | 94% 7 MB
|======================================================================================= | 95% 7 MB
|======================================================================================== | 97% 7 MB
|========================================================================================= | 98% 7 MB
|==========================================================================================| 99% 7 MB
|===========================================================================================| 100% 7 MB
BrapaAnnotation
eigen.annotated <- lapply(1:nrow(bayesint.result),function(row) {
qtl <- bayesint.result[row,]
results <- subset(BrapaAnnotation, chrom==qtl$chr &
start >= qtl$start &
end <= qtl$end)
}
)
names(eigen.annotated) <- bayesint.result$trait
eigen.annotated <- bind_rows(eigen.annotated,.id="trait") %>%
mutate(chrom=as.character(chrom)) %>%
left_join(bayesint.result,by=c("trait","chrom"="chr")) %>% #get eQTL LOD
rename(eigen_eQTL_candidate=name)
eigen.annotated.small <- eigen.annotated %>% select(trait,eigen_eQTL_candidate,ends_with("LOD"))
eigen.annotated.small
given bayesint results, find overlaps with UN growth QTL
filepath <- "../input/All2012HeightQTL2.xlsx"
filebase <- filepath %>% basename() %>% str_replace("\\..*$","")
QTLgenes <- readxl::read_excel(filepath)[,-1]
QTLgenes <- QTLgenes %>% dplyr::rename(.id=QTL, FVTtrait=FVT) # change names to match previous file
QTLgenes <- QTLgenes %>% filter(str_detect(FVTtrait,"^UN"))
QTLgenes
eigen.qtl.combined <- inner_join(eigen.annotated.small,QTLgenes,by=c("eigen_eQTL_candidate"="name")) %>%
select(.id, trait, everything())
eigen.qtl.combined
how many QTL have at least some overlap?
unique(QTLgenes$.id)
[1] "QTL1" "QTL12" "QTL13" "QTL14" "QTL15" "QTL16" "QTL17" "QTL18" "QTL19" "QTL2" "QTL3" "QTL33"
[13] "QTL34" "QTL35" "QTL6" "QTL7"
unique(eigen.qtl.combined$.id)
[1] "QTL3" "QTL7" "QTL19" "QTL1" "QTL13" "QTL6"
two of four
are all eigen genes overlapping?
unique(eigen.annotated.small$trait)
[1] "UN_MEblue" "UN_MEbrown" "UN_MEcyan" "UN_MEdarkslateblue"
[5] "UN_MElightgreen" "UN_MEmidnightblue" "UN_MEsteelblue" "UN_MEturquoise"
[9] "UN_MEyellowgreen"
unique(eigen.qtl.combined$trait)
[1] "UN_MEbrown" "UN_MEcyan" "UN_MEdarkslateblue" "UN_MEmidnightblue"
[5] "UN_MEturquoise" "UN_MEyellowgreen"
No, 2
write_csv(eigen.qtl.combined,
path=str_c("../output/", filebase, "_eigenQTL_overlap_CIM_", Sys.Date(), ".csv"))
eQTL.ranges <- GRanges(bayesint.result$chr,
ranges = IRanges(start=bayesint.result$start,
end=bayesint.result$end))
eQTL.ranges <- GenomicRanges::reduce(eQTL.ranges)
Do the simulations
sims <- 1000
set.seed(4545)
sim.results <- sapply(1:sims, function(s) {
if (s %% 100 == 0) print(s)
sim.eQTL <- tibble(
chr=sample(chr.info$chr,
size = length(eQTL.ranges),
replace = TRUE,
prob=chr.info$end/sum(chr.info$end)),
width=width(eQTL.ranges) # width of the QTL to simulate
)
sim.eQTL <- chr.info %>%
select(chr,chr.start=start,chr.end=end) %>% right_join(sim.eQTL,by="chr") #need to get the chrom end so we can sample correctly
sim.eQTL <- sim.eQTL %>% mutate(qtl.start = runif(n=n(),
min = chr.start,
max= max(chr.start,chr.end-width)),
qtl.end=qtl.start+width)
sim.eQTL.ranges <- GRanges(seqnames = sim.eQTL$chr,
ranges = IRanges(start=sim.eQTL$qtl.start,
end=sim.eQTL$qtl.end))
suppressWarnings(result <- sum(countOverlaps(qtl.ranges,sim.eQTL.ranges)>0))
result
})
[1] 100
[1] 200
[1] 300
[1] 400
[1] 500
[1] 600
[1] 700
[1] 800
[1] 900
[1] 1000
true.overlap <- sum(countOverlaps(qtl.ranges,eQTL.ranges)) #OK to ignore warnings
true.overlap
[1] 4
mean(sim.results >= true.overlap)
[1] 0.005
tibble(FVTQTL_vs_MReQTL_True_Overlaps=true.overlap,
N_Simulations_fewer_overlaps=sum(sim.results < true.overlap),
N_Simulations_greater_equal_overlaps=sum(sim.results >= true.overlap),
P_value=mean(sim.results >= true.overlap)
) %>%
write_csv(str_c("../output/", filebase, "_WGCNA_eigen_eQTL_CIM_overlap_pval_", Sys.Date(), ".csv"))